Adapting methods described in the poster uploaded on Google Drive, I first calculated the mean pre-pubertal fE2 concentration. Because infants/juvs have elevated fE2 when nursing, we need to exclude these individuals when determining what typical pre-pubertal fE2 looks like.
Per Slack communication on 7/17: defining prebuteral E from individuals greater than 2.5 years (def not nursing) and under 3 (def not pubertal)
## calculating average pre-pubertal E2
prepubertal_data<-e_data |>
filter(age_at_sample >= 2.5 & age_at_sample <= 3) |> # we want age 2.5-3...def pre menstruation
select(ind_id, age_at_sample, e_conc_ug)
prepubertal_e_data <- prepubertal_data |>
summarise(mean_e_conc_ug = mean(e_conc_ug, na.rm = TRUE),
sd_e_conc_ug = sd(e_conc_ug, na.rm = TRUE))
mean_prepubertal_e_conc_ug <- prepubertal_e_data$mean_e_conc_ug
sd_prepubertal_e_conc_ug <- prepubertal_e_data$sd_e_conc_ug
Using the mean + 2SD approach:
calculated_treshold <- mean_prepubertal_e_conc_ug + 2*sd_prepubertal_e_conc_ug
print(paste("Calculated fE2 threshold for menarche: ", calculated_treshold, " ug/g"))
## [1] "Calculated fE2 threshold for menarche: 0.784032497466729 ug/g"
Looking at the average age at which the population has an E reading that crosses the threshold E. Restricting dataset to individuals >3 years old.
## creating df ----
e_data_age_at_menarche_df <- e_data |>
select(c(age_at_sample, age_at_sample_years, e_conc_ug)) |>
group_by(age_at_sample) |>
mutate(mean_e_at_age = mean(e_conc_ug, na.rm = TRUE),
se_e_at_age = sd(e_conc_ug, na.rm = TRUE) / sqrt(n())) |>
ungroup() |>
arrange(age_at_sample) |>
distinct() |>
group_by(age_at_sample_years) |>
mutate(mean_e_at_age_years = mean(e_conc_ug, na.rm = TRUE),
se_e_at_age_years = sd(e_conc_ug, na.rm = TRUE) / sqrt(n())) |>
ungroup() |>
select(-e_conc_ug) |>
arrange(age_at_sample_years) |>
distinct()
## average_age_at_menarche----
average_age_at_menarche <- e_data_age_at_menarche_df |>
filter(age_at_sample > 3) |>
filter(mean_e_at_age > calculated_treshold) |>
slice(1) |>
pull(age_at_sample)
# standard error of average_age_at_menarche
se_age_at_menarche <- e_data_age_at_menarche_df |>
filter(age_at_sample > 3) |>
filter(mean_e_at_age > calculated_treshold) |>
summarise(se_age_at_sample = sd(age_at_sample, na.rm = TRUE) / sqrt(n())) |>
pull(se_age_at_sample)
print(paste("Calculated average age at menarche: ", average_age_at_menarche, " years"))
## [1] "Calculated average age at menarche: 3.22 years"
print(paste("SE average age at menarche: ", se_age_at_menarche))
## [1] "SE average age at menarche: 0.343412350546469"
NOTE: This calculated age is definitely too young.
The E data is super not normal at the population-level, even after attempting log transformation….I am holding off on any real modeling for now
shapiro.test(e_data_age_at_menarche_df$mean_e_at_age)
##
## Shapiro-Wilk normality test
##
## data: e_data_age_at_menarche_df$mean_e_at_age
## W = 0.20031, p-value < 2.2e-16
shapiro.test(log(e_data_age_at_menarche_df$mean_e_at_age))
##
## Shapiro-Wilk normality test
##
## data: log(e_data_age_at_menarche_df$mean_e_at_age)
## W = 0.99535, p-value = 0.008119
For interpretability, this plot just has the data sorted into age-years, rather than exact age estimations with lots of decimals
## get sample size ----
n_figure_1 <- nrow(e_data)
##Figure 1
par(mfrow = c(1,1))
ggplot(data = e_data_age_at_menarche_df |>
select(age_at_sample_years, mean_e_at_age_years, se_e_at_age_years) |>
distinct(), aes(x = age_at_sample_years, y = mean_e_at_age_years)) +
geom_smooth() +
geom_point()+
geom_vline(xintercept = average_age_at_menarche, linetype = "dashed", color = "red") +
geom_hline(yintercept = calculated_treshold, linetype = "dashed", color = "skyblue") +
geom_errorbar(aes(ymin = mean_e_at_age_years - se_e_at_age_years, ymax = mean_e_at_age_years + se_e_at_age_years), width = 0.2) +
geom_rect(aes(xmin = average_age_at_menarche - 1.96*se_age_at_menarche, xmax = average_age_at_menarche + 1.96*se_age_at_menarche,
ymin = -Inf, ymax = Inf), alpha = 0.05, fill = "pink") +
ggtitle(paste0("Average estradiol by age at sample (N = ", n_figure_1, " samples)"))+
xlab("Age at sample")+
ylab("Fecal estradiol (ug/g)")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Interpreting the graph:
The black circles and error bars represent the average age fE2 reading (with standard error) for each age-year for the population
The blue line represents the results of a Loess regression, a non-parametric approach to modeling the relationship between x and y. The gray shading around the blue line is the error of the estimate.
The dashed light blue line is the calculated prepubertal fE threshold (y = 0.7840325 ug/g)
The dashed red line is the calculated average age at puberty (x = 3.22 years)
The shaded area around the dashed red line is the 95% CI for the average age at puberty estimate (3.22 ± 1.96* SE)
NOTE: I haven’t yet calculated individual E thresholds/ages at menarche because I think we are going to want a different approach than mean + 2sd….still working on that.
I am pulling data on females that we have good E coverage for before and after potential menarche…I’m looking for IDs with samples starting at least by age 3. At least 25 samples per ID. That leaves us with the following:
par(mfrow = c(1,1))
# Panel of all individual females who have good E coverage before and after menarche
females_for_menarche_panel<-e_data |>
group_by(ind_id) |>
filter(min(age_at_sample)<3) |>
mutate(count = n()) |>
ungroup() |>
select(ind_id,count) |>
arrange(desc(count)) |>
distinct() |>
filter(count>25) |>
pull(ind_id)
females_for_menarche_panel_df<- e_data |>
filter(ind_id %in% females_for_menarche_panel) |>
filter(age_at_sample < 8) |>
select(ind_id, age_at_sample, sample_id, date, month, time, hour, e_conc_ug, is_pregnant) |>
arrange(ind_id, age_at_sample)
n_figure_2 <-nrow(females_for_menarche_panel_df)
ggplot(data = females_for_menarche_panel_df, aes(x = age_at_sample, y = log(e_conc_ug), color = ind_id)) +
geom_point()+
facet_wrap(~ ind_id)+
ggtitle(paste0("Estradiol by age at sample, separated by individual (n = ", n_figure_2, " samples)"))+
xlab("Age at sample")+
ylab("Log-transformed fecal estradiol (log(ug/g))")+
theme(legend.position = "none")
^Note that the y-axis is log-transformed on this plot
shapiro.test(females_for_menarche_panel_df$e_conc_ug) #nope
##
## Shapiro-Wilk normality test
##
## data: females_for_menarche_panel_df$e_conc_ug
## W = 0.46254, p-value < 2.2e-16
shapiro.test(log(females_for_menarche_panel_df$e_conc_ug)) # log transformation does the trick!
##
## Shapiro-Wilk normality test
##
## data: log(females_for_menarche_panel_df$e_conc_ug)
## W = 0.99592, p-value = 0.1774
Log transformation does the trick! Data is now normal for our young female panel.
I am including individual ID as a random effect. In unadjusted model,
fixed effects are month and time of day (accounting for seasonality and
diurnal variation) with individual ID included as a random effect. Any
other things to control for?
The adjusted model includes age as a fixed effect. Age improves the
model fit!
unadjusted_e_model <- lmer(log(e_conc_ug) ~ + month + hour+ (1 | ind_id), data = females_for_menarche_panel_df)
adjusted_e_model <- lmer(log(e_conc_ug) ~ age_at_sample + month + hour+ (1 | ind_id), data = females_for_menarche_panel_df)
anova(unadjusted_e_model, adjusted_e_model) # adjusted_e_model fits better
## refitting model(s) with ML (instead of REML)
## Data: females_for_menarche_panel_df
## Models:
## unadjusted_e_model: log(e_conc_ug) ~ +month + hour + (1 | ind_id)
## adjusted_e_model: log(e_conc_ug) ~ age_at_sample + month + hour + (1 | ind_id)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## unadjusted_e_model 5 1879.4 1900.8 -934.70 1869.4
## adjusted_e_model 6 1751.7 1777.5 -869.86 1739.7 129.68 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(adjusted_e_model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: log(e_conc_ug) ~ age_at_sample + month + hour + (1 | ind_id)
## Data: females_for_menarche_panel_df
##
## REML criterion at convergence: 1758.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.16124 -0.62234 0.00391 0.65935 2.72881
##
## Random effects:
## Groups Name Variance Std.Dev.
## ind_id (Intercept) 0.150 0.3874
## Residual 1.461 1.2087
## Number of obs: 537, groups: ind_id, 8
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -2.78282 0.32894 -8.460
## age_at_sample 0.50977 0.04226 12.062
## month 0.08792 0.01492 5.894
## hour -0.04200 0.01934 -2.171
##
## Correlation of Fixed Effects:
## (Intr) ag_t_s month
## age_at_smpl -0.581
## month -0.257 0.022
## hour -0.616 -0.018 -0.031
#checking for multicollinearity -- low (~ 1)
vif(adjusted_e_model)
## age_at_sample month hour
## 1.000802 1.001467 1.001289
print("No multicollinearity -- yay!")
## [1] "No multicollinearity -- yay!"
Tldr: model passes diagnostics—yay
# checking model diagnostics
plot(adjusted_e_model) #resids vs leverage -- pretty good
lattice::qqmath(adjusted_e_model) ## qq norm plot -- good
plot(adjusted_e_model, # scale location plot -- pretty good
sqrt(abs(resid(.)))~fitted(.),
type=c("p","smooth"), col.line=1)
NOTE: I am holding off on reporting on this until we have the updated behavioral data containing group scan observations of duck face….planning to adapt methods from Sen et al. 2022
NOTE: I am holding off on reporting on this until we have the updated behavioral data containing group scan observations of duck face….planning to adapt methods from Sen et al. 2022
Could you please share the estimated/confirmed ages of all of our IDs? I can’t really assign this easily (unless I work backwards from age at sample info….). I know I had made a Google Sheet at some point with this information, but I no longer have access. Thanks!
Haven’t looked into this yet…stay tuned.
QUESTION: I see some individuals are marekd as
pregnant (e..g, is_pregnant = “P”), but the
preg_trim column is blank. What do I do about this?
For now, I’m excluding pregnant individuals with missing trimester info.
pregnant_df<-e_data |>
select(ind_id, month, hour, age_at_sample, e_conc_ug, preg_trim, pregnancy_num, is_pregnant) |>
filter(is_pregnant == "P") |>
filter(preg_trim !="")
n_pregnant_df <- nrow(pregnant_df)
Note: y-axis is log-transformed
par(mfrow = c(1,1))
ggplot(data = pregnant_df, aes(x = preg_trim, y = log(e_conc_ug)))+
geom_boxplot()+
geom_jitter(alpha = 0.1)+
ggtitle(paste("Estradiol and gestation (N = ", n_pregnant_df, "samples)")) +
xlab("Pregnancy trimester")+
ylab("Log-transformed fecal estradiol (log(ug/g))")
Data is normal following log transformation
shapiro.test(pregnant_df$e_conc_ug)
##
## Shapiro-Wilk normality test
##
## data: pregnant_df$e_conc_ug
## W = 0.33877, p-value < 2.2e-16
shapiro.test(log(pregnant_df$e_conc_ug))
##
## Shapiro-Wilk normality test
##
## data: log(pregnant_df$e_conc_ug)
## W = 0.99539, p-value = 0.6763
Among trimesters, we only see that T2 is significantly lower than T3; no significant difference between T1-T2 or T1-T3.
anova_result_pregancy_e <- aov(log(e_conc_ug) ~ preg_trim, data = pregnant_df)
summary(anova_result_pregancy_e)
## Df Sum Sq Mean Sq F value Pr(>F)
## preg_trim 2 17.7 8.853 5.964 0.00296 **
## Residuals 242 359.2 1.484
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tukey_results_pregnancy_e <- TukeyHSD(anova_result_pregancy_e)
print(tukey_results_pregnancy_e)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = log(e_conc_ug) ~ preg_trim, data = pregnant_df)
##
## $preg_trim
## diff lwr upr p adj
## T2-T1 -0.3732333 -0.8473855 0.1009188 0.1537996
## T3-T1 0.2486747 -0.2191212 0.7164707 0.4229352
## T3-T2 0.6219081 0.1957494 1.0480667 0.0019638
## what other control variables should I include?
unadjusted_pregnancy_model <- lmer(log(e_conc_ug) ~ age_at_sample + month+ hour+ (1 | ind_id), data = pregnant_df)
adjusted_pregnancy_model <- lmer(log(e_conc_ug) ~ age_at_sample + month + hour+preg_trim + (1 | ind_id), data = pregnant_df)
anova(unadjusted_pregnancy_model, adjusted_pregnancy_model) # adjusted model is better fit--pregnancy predicts E
## refitting model(s) with ML (instead of REML)
## Data: pregnant_df
## Models:
## unadjusted_pregnancy_model: log(e_conc_ug) ~ age_at_sample + month + hour + (1 | ind_id)
## adjusted_pregnancy_model: log(e_conc_ug) ~ age_at_sample + month + hour + preg_trim + (1 | ind_id)
## npar AIC BIC logLik deviance Chisq Df
## unadjusted_pregnancy_model 6 784.51 805.52 -386.25 772.51
## adjusted_pregnancy_model 8 775.48 803.49 -379.74 759.48 13.024 2
## Pr(>Chisq)
## unadjusted_pregnancy_model
## adjusted_pregnancy_model 0.001486 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(adjusted_pregnancy_model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: log(e_conc_ug) ~ age_at_sample + month + hour + preg_trim + (1 |
## ind_id)
## Data: pregnant_df
##
## REML criterion at convergence: 781.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.74934 -0.62059 0.01613 0.67608 3.01420
##
## Random effects:
## Groups Name Variance Std.Dev.
## ind_id (Intercept) 0.4587 0.6773
## Residual 1.1646 1.0792
## Number of obs: 245, groups: ind_id, 23
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 1.78746 0.52446 3.408
## age_at_sample 0.01562 0.02276 0.686
## month 0.02944 0.02363 1.246
## hour -0.07165 0.02894 -2.476
## preg_trimT2 -0.17724 0.20993 -0.844
## preg_trimT3 0.41154 0.19692 2.090
##
## Correlation of Fixed Effects:
## (Intr) ag_t_s month hour prg_T2
## age_at_smpl -0.538
## month -0.372 -0.015
## hour -0.655 0.008 0.030
## preg_trimT2 -0.293 -0.080 0.452 -0.019
## preg_trimT3 -0.311 -0.017 0.403 -0.019 0.660
Tldr: model passes diagnostics—yay
#checking for multicollinearity -- low (~ 1)
vif(adjusted_pregnancy_model)
## GVIF Df GVIF^(1/(2*Df))
## age_at_sample 1.009100 1 1.004540
## month 1.291221 1 1.136319
## hour 1.002459 1 1.001229
## preg_trim 1.301658 2 1.068130
# checking model diagnostics
plot(adjusted_pregnancy_model) #resids vs leverage -- good
lattice::qqmath(adjusted_pregnancy_model) ## qq norm plot -- good...small tail on right
plot(adjusted_pregnancy_model, # scale location plot -- good
sqrt(abs(resid(.)))~fitted(.),
type=c("p","smooth"), col.line=1)
Only looking at adults (ages >6)
lactation_df <- e_data |>
select(ind_id, month, hour, age_at_sample, e_conc_ug, lactating) |>
filter(age_at_sample > 6)
n_lactation_df<-nrow(lactation_df)
Note: y-axis is log-transformed
ggplot(data = lactation_df, aes(x = lactating, y = log(e_conc_ug)))+
geom_boxplot()+
geom_jitter(alpha = 0.1)+
ggtitle(paste("Estradiol and lactation (N = ", n_pregnant_df, "samples)")) +
xlab("Lactating")+
ylab("Log-transformed fecal estradiol (log(ug/g))")
Data is still not normal following log transformation
shapiro.test(lactation_df$e_conc_ug)
##
## Shapiro-Wilk normality test
##
## data: lactation_df$e_conc_ug
## W = 0.26856, p-value < 2.2e-16
shapiro.test(log(lactation_df$e_conc_ug))
##
## Shapiro-Wilk normality test
##
## data: log(lactation_df$e_conc_ug)
## W = 0.99144, p-value = 6.802e-05
Data is not normal, so can’t do ANOVA. Let’s use a nonparametric approach to see if E is significantly lower during lactation….it is (W = 98014, p < 0.001)
wilcox.test(log(e_conc_ug) ~ lactating, data = lactation_df)
##
## Wilcoxon rank sum test with continuity correction
##
## data: log(e_conc_ug) by lactating
## W = 98014, p-value = 5.847e-09
## alternative hypothesis: true location shift is not equal to 0
Technically, data should be normal for this. BUT I think our sample might be large enough that we’re okay violating this assumption?
## what other control variables should I include?
unadjusted_lactation_model <- lmer(log(e_conc_ug) ~ age_at_sample + month+ hour + (1 | ind_id), data = e_data)
adjusted_lactation_model <- lmer(log(e_conc_ug) ~ age_at_sample + month+ hour + lactating + (1 | ind_id), data = e_data)
anova(unadjusted_lactation_model,adjusted_lactation_model) # adjusted model is better fit--lactation predicts E
## refitting model(s) with ML (instead of REML)
## Data: e_data
## Models:
## unadjusted_lactation_model: log(e_conc_ug) ~ age_at_sample + month + hour + (1 | ind_id)
## adjusted_lactation_model: log(e_conc_ug) ~ age_at_sample + month + hour + lactating + (1 | ind_id)
## npar AIC BIC logLik deviance Chisq Df
## unadjusted_lactation_model 6 4843.2 4874.7 -2415.6 4831.2
## adjusted_lactation_model 7 4803.3 4840.0 -2394.6 4789.3 41.974 1
## Pr(>Chisq)
## unadjusted_lactation_model
## adjusted_lactation_model 9.25e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(adjusted_lactation_model) # lactation = lower E
## Linear mixed model fit by REML ['lmerMod']
## Formula: log(e_conc_ug) ~ age_at_sample + month + hour + lactating + (1 |
## ind_id)
## Data: e_data
##
## REML criterion at convergence: 4813.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.1108 -0.6274 -0.0233 0.6412 3.3238
##
## Random effects:
## Groups Name Variance Std.Dev.
## ind_id (Intercept) 2.147 1.465
## Residual 1.626 1.275
## Number of obs: 1403, groups: ind_id, 44
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -1.633063 0.349677 -4.670
## age_at_sample 0.173253 0.020625 8.400
## month 0.061315 0.009774 6.273
## hour -0.022011 0.012923 -1.703
## lactatingYES -0.635455 0.094448 -6.728
##
## Correlation of Fixed Effects:
## (Intr) ag_t_s month hour
## age_at_smpl -0.587
## month -0.188 0.038
## hour -0.426 0.007 0.032
## lactatngYES 0.058 -0.182 -0.047 0.040
Tldr: model passes diagnostics—yay
#checking for multicollinearity -- low (~ 1)
vif(adjusted_lactation_model)
## age_at_sample month hour lactating
## 1.035517 1.004192 1.002907 1.038010
# checking model diagnostics
plot(adjusted_lactation_model) #resids vs leverage -- good
lattice::qqmath(adjusted_lactation_model) ## qq norm plot -- good...small tail on right
plot(adjusted_lactation_model, # scale location plot -- good
sqrt(abs(resid(.)))~fitted(.),
type=c("p","smooth"), col.line=1)
Can I please have access to these data?